Key Points
14,744 patients had dynamic data including heart rate, systolic blood pressure, diastolic blood pressure, pulse pressure, respiration rate and blood oxygen saturation for the first 4h. 253 out of them died within 24h after entering in ICU, after 48h the number increased to 515.
Notably, there is a clear separation between people who died in ICU vs the ones who did not, at the starting point
rm(list = ls())
library(data.table) # For fast loading and pre-processing of large datasets
packageVersion("data.table")
library(ggplot2)
library(MASS)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
summarySEmedian <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
median = median (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("median" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
mergeDynChart <- function(rawDF, dynvariable, na.action = pass, na.rm=FALSE,
all=TRUE) {
mergeDF <- merge(rawDF[, c("icustay_id", "subject_id", "itemid", "charttime", "valuenum", "valueuom")], PatientInfo, by = c("icustay_id", "subject_id"), all = TRUE )
mergeDF$time <- round(unclass(difftime(as.POSIXlt(mergeDF$charttime), as.POSIXlt(mergeDF$intime), units="hours")))
mergeHours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time, data = mergeDF, FUN = "mean", na.rm=FALSE, na.action = na.pass)
names(mergeHours) <- c("icustay_id", "subject_id", "death_icu", "time", deparse(substitute(dynvariable)))
mergeHours <- mergeHours[mergeHours$time >= 0,]
mergeHours[5] <- round(mergeHours[5], 2)
results <- list("mergeDF" = mergeDF, "mergeHours" = mergeHours)
return(results)
}
mergeDynLab <- function(rawDF, dynvariable, na.action = pass, na.rm=FALSE,
all=TRUE) {
mergeDF <- merge(rawDF[, c("icustay_id", "subject_id", "itemid", "charttime", "valuenum", "valueuom")], PatientInfo, by = c("subject_id"), all = TRUE )
mergeDF$time <- round(unclass(difftime(as.POSIXlt(mergeDF$charttime), as.POSIXlt(mergeDF$intime), units="hours")))
mergeHours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time, data = mergeDF, FUN = "mean", na.rm=FALSE, na.action = na.pass)
names(mergeHours) <- c("icustay_id", "subject_id", "death_icu", "time", deparse(substitute(dynvariable)))
mergeHours <- mergeHours[mergeHours$time >= 0,]
mergeHours[5] <- round(mergeHours[5], 2)
results <- list("mergeDF" = mergeDF, "mergeHours" = mergeHours)
return(results)
}
StatisticalRange <- function(value, alpha=0.01){
library(MASS)
x <- value
N <- length(value)
aux <- boxcox( lm(x ~ 1) )
lambda <- aux$x[which(aux$y == max(aux$y))]
x1 <- (x^lambda - 1)/lambda # (with x1 <- log(x) if lambda = 0)
lowertrans <- qnorm(alpha/N, mean = mean(x1), sd = sd(x1))
uppertrans <- qnorm(1-alpha/N, mean = mean(x1), sd = sd(x1))
lower <- (lowertrans*lambda + 1)^(1/lambda)
upper <- (uppertrans*lambda + 1)^(1/lambda)
results <- list( "lower" = round(lower, 2), "upper" = round(upper,2))
return(results)
}
basicAnalysis <- function(mergeHours, dynvariable, variablefreetext="HR" ) {
died1 <- mergeHours[mergeHours$death_icu == 1, ]
died0 <- mergeHours[mergeHours$death_icu == 0, ]
#summary(died1)
#summary(died0)
BasicBoxplot <- boxplot(died1[,5], died0[,5],
names = c("Died in ICU", "Did not die in ICU"),
main = paste(variablefreetext, "stratified by death in ICU"))
results <- list("died1" = died1, "died0" = died0, "BasicBoxplot" = BasicBoxplot)
return(results)
}
medianValuesPlot <- function(died1, died0, dynvariable, hours=72, na.rm = TRUE){
vartext <- deparse(substitute(dynvariable))
median1 <- summarySEmedian(died1, measurevar=vartext, groupvars=c("time"), na.rm = TRUE)
median0 <- summarySEmedian(died0, measurevar=vartext, groupvars=c("time"), na.rm = TRUE)
names(median1) <- c("time", "N", "var", "sd", "se", "ci")
names(median0) <- c("time", "N", "var", "sd", "se", "ci")
#All times
plotAlltimes0 <- ggplot(median0, aes(x=time, y=var)) +
geom_errorbar(aes(ymin=var-se, ymax=var+se), width=.1) +
ggtitle(paste(vartext, "(died = 0)"))
plotAlltimes1 <- ggplot(median1, aes(x=time, y=var)) +
geom_errorbar(aes(ymin=var-se, ymax=var+se), width=.1) +
ggtitle(paste(vartext, "(died = 1)"))
plot <- ggplot() +
geom_line(data = median1[median1$time <= hours, ], aes(x=time, y=var), color = "blue") +
geom_line(data = median0[median0$time <= hours, ], aes(x=time, y=var)) +
theme_minimal() +
theme(plot.title = element_text(size = 14, family = "Tahoma", face = "bold", hjust = 0.5),
text = element_text(size = 12, family = "Tahoma"),
axis.title = element_text(face="bold"),
axis.text.x=element_text(size = 11),
legend.position = "bottom") +
ggtitle(paste(vartext,"after", hours, "hours h in ICU"))
results <- list("median1" = median1, "median0" = median0,
"plotAlltimes0" = plotAlltimes0, "plotAlltimes1" = plotAlltimes0, "plotMedian" = plot)
return(results)
}
PlotMaxMin <- function(died1, died0, dynvariable, hours=360){
names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
max_1 <- aggregate(var ~ time, data = died1, FUN = "max")
min_1 <- aggregate(var ~ time, data = died1, FUN = "min")
max_0 <- aggregate(var ~ time, data = died0, FUN = "max")
min_0 <- aggregate(var ~ time, data = died0, FUN = "min")
plot1 <- ggplot() +
geom_line(data = max_1[max_1$time < hours,], aes(x=time, y=var), color = "blue") +
geom_line(data = min_1[min_1$time < hours,], aes(x=time, y=var), color = "blue") +
geom_line(data = max_0[max_0$time < hours,], aes(x=time, y=var), color = "black") +
geom_line(data = min_0[min_0$time < hours,], aes(x=time, y=var), color = "black") +
theme_minimal() +
labs(title = paste("Maximum and minimum", dynvariable, "values in the first",hours ,"hours in ICU"),
x="Time (h)")
results <- list("plotMaxMin" = plot1 )
return(results)
}
SpaguettiPlot <- function(died1, died0, variablefreetext, patients=50, hours=360){
names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
# random sample
set.seed(150)
indexes1 = sample(died1$icustay_id, size=patients)
SpagPlot1 <-died1[died1$icustay_id %in%indexes1 & died1$time < hours,]
set.seed(151)
indexes0 = sample(died0$icustay_id, size=patients)
SpagPlot0 <-died0[died0$icustay_id %in%indexes0 & died0$time < hours,]
interaction.plot(SpagPlot1$time, SpagPlot1$icustay_id, SpagPlot1$var,
xlab = "Time",
main=(paste(variablefreetext, "in", patients, "patients who died in ICU")),
legend = F)
interaction.plot(SpagPlot0$time, SpagPlot0$icustay_id, SpagPlot0$var,
xlab = "Time",
main=(paste(variablefreetext, "in", patients, "patients who did not die in ICU")) ,
legend = F)
}
BoxPlot4h <- function(died0, died1, hours, dynvariable, ylab=200){
names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
names(hours) <- c("icustay_id","subject_id","death_icu","time","var")
died00 <- length(unique(died0[died0$time ==0,]$icustay_id))
died01 <- length(unique(died0[died0$time ==1,]$icustay_id))
died02 <- length(unique(died0[died0$time ==2,]$icustay_id))
died03 <- length(unique(died0[died0$time ==3,]$icustay_id))
died04 <- length(unique(died0[died0$time ==4,]$icustay_id))
died10 <- length(unique(died1[died1$time ==0,]$icustay_id))
died11 <- length(unique(died1[died1$time ==1,]$icustay_id))
died12 <- length(unique(died1[died1$time ==2,]$icustay_id))
died13 <- length(unique(died1[died1$time ==3,]$icustay_id))
died14 <- length(unique(died1[died1$time ==4,]$icustay_id))
BoxPlot <- ggplot() +
geom_boxplot(data = hours[hours$time==0,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
geom_boxplot(data = hours[hours$time==1,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
geom_boxplot(data = hours[hours$time==2,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
geom_boxplot(data = hours[hours$time==3,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
geom_boxplot(data = hours[hours$time==4,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
theme_minimal() +
theme(plot.title = element_text(size = 14, family = "Tahoma", face = "bold", hjust = 0.5),
text = element_text(size = 12, family = "Tahoma"),
axis.title = element_text(face="bold"),
axis.text.x=element_text(size = 11),
legend.position = "bottom") +
ggtitle(paste("Boxplot of", dynvariable, " in the first 4 hours")) +
scale_x_discrete(name ="Time (hours)", limits = c("1", "2", "3", "4")) +
annotate("text", x=-0.25, y=ylab, label= paste("n=", died00), size = 3.2) +
annotate("text", x=0.25, y=ylab, label= paste("n=", died10), size = 3.2) +
annotate("text", x=0.75, y=ylab, label= paste("n=", died01), size = 3.2) +
annotate("text", x=1.25, y=ylab, label= paste("n=", died11), size = 3.2) +
annotate("text", x=1.75, y=ylab, label= paste("n=", died02),size = 3.2) +
annotate("text", x=2.25, y=ylab, label= paste("n=", died12), size = 3.2) +
annotate("text", x=2.75, y=ylab, label= paste("n=", died03), size = 3.2) +
annotate("text", x=3.25, y=ylab, label= paste("n=", died13), size = 3.2) +
annotate("text", x=3.75, y=ylab, label= paste("n=", died04), size = 3.2) +
annotate("text", x=4.25, y=ylab, label= paste("n=", died14), size = 3.2)
results <- list("BoxPlot" = BoxPlot)
return(results)
}
NAimputation <- function(Df, col){
for (i in unique(Df$icustay_id)){
if (is.na(Df[Df$icustay_id == i & Df$time == 0,][col])){
Df[Df$icustay_id == i & Df$time == 0,][col] <- Df[Df$icustay_id == i & Df$time == 1,][col]
}
if (is.na(Df[Df$icustay_id == i & Df$time == 3,][col])){
Df[Df$icustay_id == i & Df$time == 3,][col] <- Df[Df$icustay_id == i & Df$time == 2,][col]
}
if (is.na(Df[Df$icustay_id == i & Df$time == 1,][col])){
average_value1 <- (Df[Df$icustay_id == i & Df$time == 0,][col] + Df[Df$icustay_id == i & Df$time == 2,][col])/2
Df[Df$icustay_id == i & Df$time == 1,][col] <- average_value1
}
if (is.na(Df[Df$icustay_id == i & Df$time == 2,][col])){
average_value2 <- (Df[Df$icustay_id == i & Df$time == 1,][col] + Df[Df$icustay_id == i & Df$time == 3,][col])/2
Df[raw_4H2$icustay_id == i & Df$time == 2,][col] <- average_value2
}
}
return(Df)
}
The datasets were downloaded from physionet.org as compressed files, and were decompressed before building the database (after decompression: 46.58 GB).
The MIMICIII database was built following this tutorial, which creates first the tables and then add the data.
To get familiarise with the database and SQL query language, I completed the tutorial in the MIMICIII official website.
For example, this query counts the number of patients who died in the hospital:
SELECT expire_flag, COUNT(*)
FROM patients
GROUP BY expire_flag;
ITEMID, which is an identifier for a single measurement type in the database. Each row associated with one ITEMID (e.g. 212) corresponds to an instantiation of the same measurement (e.g. heart rate).There are some important considerations:
D_ITEMS is sourced from two distinct ICU databases. The main consequence is that there are duplicate ITEMID for each concept. For example, heart rate is captured both as an ITEMID of 212 (CareVue) and as an ITEMID of 220045 (Metavision). As a result, it is necessary to search for multiple ITEMID to capture a single concept across the entire database. All Metavision ITEMIDs will have a value > 220000.
Another source of duplicate ITEMID is due to the free text nature of data entry in CareVue - as a result there are additional ITEMID which correspond to misspellings or synonymous descriptions of a single concept. It is important to search for all possible abbreviations and descriptions of a concept to capture all associated ITEMID.
The ITEMID for the variables of interest were manually look up in the database:
SELECT *
FROM d_items
WHERE label LIKE "heart rate";
The query resulted in 2 rows: 211 and 220045, as indicated above. It is measured in bpm (just for the metavision records).
SELECT *
FROM d_items
WHERE label LIKE "%ph%"
ORDER BY label;
The query resulted in 433 rows, and the following were manually selected: pH (1673, 7459) pH arterial(4753, 223830, 780, 1126), pH (cap?) (4202), pH dipstick (220734), pH other (3839), pH SOFT (228243), pH venous (220274, 860), pH level (6003). In case we also want to include urine pH (6754, 1352, 1495, 1880, 17262).
Pulse pressure. It is the difference between the systolic and diastolic blood pressure. It is measured in millimeters of mercury (mmHg). Therefore, the diastolic pressure in addition to the systolic pressure needs to be extracted.
Respiration rate.
SELECT *
FROM d_items
WHERE label LIKE "%respiratory%"
ORDER BY label;
SELECT *
FROM d_items
WHERE label LIKE "%respiration%"
ORDER BY label;
The query results in 18 rows, and the following are manually selected: repiratory rate coded as 618 , 220210 and 224690 (Respiratory Rate (Total)), 228234 (PAR-Respiration). Note there are additional items such as respiratory rate set, when is controlled by the respiratory centre (224688, 619) or respiratory rate spontaneous (224689).
SELECT *
FROM d_items
WHERE label LIKE "%oxygen%"
ORDER BY label;
The query results in 7 rows and PAR-Oxygen saturation is manually selected (228232).
SELECT *
FROM d_items
WHERE label LIKE "%saturation%"
ORDER BY label;
The query results in 10 rows and the manual selection includes: PAR-Oxygen Saturation (228232), systemic vascular resistance(SVR) %O2 Saturation (226865), right atrial(RA) %O2 Saturation (226860), peripherial vascular resistance (PVR) %O2 Saturation (226863), Post Anesthetic Recovery (PAR) %O2 Saturation (228232), arterial partial pressure (PA) %O2 Saturation (226862), pulse oxeginity saturation (220277), arterial O2 Saturation (220227) and ART %O2 saturation (226861).
SELECT *
FROM d_items
WHERE label LIKE "%systolic%"
ORDER BY label;
The query returns 27 rows, from which arterial blood pressure systolic is selected: 225309, 220050, 6071, 51. Also, manual measurements: manual blood pressure systolic left (224167) and manual blood pressure systolic right (227243).
SELECT *
FROM d_items
WHERE label LIKE "%temperature%"
ORDER BY label;
The query results in 16 rows and the manual selection included: temperature In F (679,678, 223761) and Celsius (677, 676,223762), and score based on ApacheIV (227054).
SELECT *
FROM d_items
WHERE label LIKE "%white%"
ORDER BY label;
The query returns only one row: WhiteBloodC 4.0-11.0 (3834). These are in LABEVENTS.
Johnson et al (2017) made available online the code used to run their analysis. It includes information about ITEMID for heart rate, systolic blood pressure, temperature, white blood cell count and respiration rate (this last one only for the carevue records). Therefore, heart rate, systolic blood pressure, temperature and white blood cell count records are extracted here with the same ITEMID. As for respiration rate, we also included metavision values.
Extracting the dynamic data for our cohort. This include the 31,545 patients as defined in DescMIMICIII.Rmd.
The static data is stored as StaticCohort.csv, in which patients are identified with the icustay_id. This column is extracted separated by commas so it can be used to query in MySQL. In bash:
tail -n+2 StaticCohort.csv | cut -f1 | tr "\n" "," > StaticCohortID.txt
In the next steps, #list of icustay_id are the 31,545 icustay_id stored in StaticCohortID.txt.
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
(
211 -- Heart Rate
, 220045 -- Heart Rate
);
Results saved as StaticCohortHR.csv. To double-check if the extract was done okay, the number of patients are extracted in bash (should be 25,145), and the first 6 lines shown:
cat StaticCohortHR.csv | cut -d"," -f1 | sort -u | wc -l
head StaticCohortHR.csv
Then it is compressed as follows:
cat StaticCohortHR.csv | gzip > StaticCohortHR.csv.gz
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
(
6 -- ABP [Systolic]
, 51 -- Arterial BP [Systolic]
, 455 -- NBP [Systolic]
, 6701 -- Arterial BP #2 [Systolic]
, 220050 -- Arterial Blood Pressure systolic
, 220179 -- Non Invasive Blood Pressure systolic
);
Results saved as StaticCohortSBP.csv.
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
(
676 -- Temperature C
, 677 -- Temperature C (calc)
, 678 -- Temperature F
, 679 -- Temperature F (calc)
, 223761 -- Temperature Fahrenheit
, 223762 -- Temperature Celsius
);
Results saved as StaticCohortTEMP.csv.
LABEVENTS chart, which uses SUBJECT_ID instead of ICUSTAY_ID to identify the patients. Therefore, first the SUBJECT_IDs for our cohort are extracted based on the ICUSTAY_IDs, and used to get the white blood cell count measuremet.SELECT subject_id, icustay_id
FROM icustays
WHERE icustay_id IN (#list of icustay_id);
The query is exported as StaticCohortSUBJECTID.csv. Then, the SUBJECTID in bash:
tail -n+2 StaticCohortPATIENTID.csv | cut -d "," -f1 | tr "\n" "," > StaticCohortSUBJECTID.txt
In the next steps, #list of subject_id are the 31,545 subject_id stored in StaticCohortSUBJECTID.txt.
SELECT subject_id, itemid, charttime, valuenum, valueuom
FROM labevents
WHERE subject_id IN (#list of subject_id)
AND itemid IN
(
51300 -- WBC
, 51301 -- White Blood Cells
);
Results saved as StaticCohortWBC.csv.
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM icustays
WHERE icustay_id IN (#list of icustay_id);
AND itemid IN
(
618 -- Respiratory Rate
, 615 -- Resp Rate (Total)
, 220210 -- Respiratory Rate
, 224690 -- Respiratory Rate (Total)
)
Results saved as StaticCohortRR.csv. Then zip bash cat StaticCohortHR.csv | gzip > StaticCohortHR.csv.gz
ITEMID was manually selected. The query resulted in 433 rows, and the following were manually selected: pH (1673, 7459) pH arterial(4753, 223830, 780, 1126), pH (cap?) (4202), pH dipstick (220734), pH other (3839), pH SOFT (228243), pH venous (220274, 860), pH level (6003). In case we also want to include urine pH (6754, 1352, 1495, 1880, 17262).SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM labevents
WHERE icustay_id IN (#list of subject_id)
AND itemid IN
(
1673 -- PH
, 7459 -- Ph
, 4753 -- pH (Art)
, 223830 -- PH (Arterial)
, 780 -- Arterial pH
, 1126 -- Art.pH
, 220274 -- PH (Venous)
, 860 -- Venous pH
);
Results saved as StaticCohortPH.csv.
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM icustays
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
(
646 -- SpO2
, 220277 -- SpO2
, 834 -- SaO2
, 220227 -- Arterial O2 Saturation (SaO2)
);
Results saved as StaticCohortO2S.csv.
ITEMID are chosen as defined by Johnson et al (2016).SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
(
8368 -- Arterial BP [Diastolic]
, 8440 -- Manual BP [Diastolic]
, 8441 -- NBP [Diastolic]
, 8555 -- Arterial BP #2 [Diastolic]
, 220180 -- Non Invasive Blood Pressure diastolic
, 220051 -- Arterial Blood Pressure diastolic
);
Results saved as StaticCohortDBP.csv.
To extract the outtime information (it should match the death time for patients who died in ICU, )
SELECT subject_id, icustay_id, intime, outtime
FROM icustays
WHERE icustay_id IN (#list of icustay_id);
The results are exported as StaticCohortTIME.csv
In bash, we extracted the death status for the icustay_id
cat StaticCohort.csv | cut -f1,4 > DeathStatusSC.csv
# Reading the expression counts
olddir <- getwd()
setwd(data.path)
# Load IcustayID, SubjectID, INTIME
PatientTIME <- fread(input = 'cat < /Users/maria/Desktop/MIMIC-III/mappingID/StaticCohortTIME.csv ')
# Load Death Status in ICU ( in bash: cat StaticCohort.csv | cut -f1,4 > DeathStatusSC.csv)
DeathICU <- fread(input = 'cat < /Users/maria/Desktop/MIMIC-III/mappingID/DeathStatusSC.csv ')
#Merge intime with death status
PatientInfo <- merge(DeathICU, PatientTIME, by = c("icustay_id"))
# Load dynamic data
DfHR <- fread(input = 'zcat < StaticCohortHR.csv.gz')
DfDBP <- fread(input = 'zcat < StaticCohortDBP.csv.gz')
DfO2S <- fread(input = 'zcat < StaticCohortO2S.csv.gz')
# DfPH <- fread(input = 'zcat < StaticCohortPH.csv.gz')
DfRR <- fread(input = 'zcat < StaticCohortRR.csv.gz')
DfSBP <- fread(input = 'zcat < StaticCohortSBP.csv.gz')
DfTEMP <- fread(input = 'zcat < StaticCohortTEMP.csv.gz')
DfWBC <- fread(input = 'zcat < StaticCohortWBC.csv.gz')
#DfTEMP <- fread(input = 'zcat < Users/maria/Desktop/MIMIC-III/dynamic_dat/StaticCohortTEMP.csv.gz')
# We can extract death from here
setwd(olddir)
We were interested in extracting the data for the following eight common clinical variables: heart rate, pH, pulse pressure, respiration rate, blood oxygen saturation, systolic blood pressure, temperature and white blood cell count.
length(unique(DfHR$icustay_id))
length(unique(DfDBP$icustay_id))
length(unique(DfO2S$icustay_id))
#length(unique(DfPH$icustay_id))
length(unique(DfRR$icustay_id))
length(unique(DfSBP$icustay_id))
length(unique(DfTEMP$icustay_id))
length(unique(DfWBC$subject_id))
# It has text
# unique(DfHR$value)
# all Numeric (or null)
# unique(DfDBP$value)
# unique(DfSBP$value)
# unique(DfRR$value)
# unique(DfTEMP$value)
# unique(DfO2S$value)
# unique(DfWBC$value)
The temperature could be recorded as Celsius or Fahrenheit, therefore, all the measurements were transformed to Celsius.
DfHR$valuenum <- as.numeric(DfHR$valuenum)
## Warning: NAs introduced by coercion
DfSBP$valuenum <- as.numeric(DfSBP$valuenum)
## Warning: NAs introduced by coercion
DfDBP$valuenum <- as.numeric(DfDBP$valuenum)
DfRR$valuenum <- as.numeric(DfRR$valuenum)
## Warning: NAs introduced by coercion
DfTEMP$valuenum <- as.numeric(DfTEMP$valuenum)
## Warning: NAs introduced by coercion
DfO2S$valuenum <- as.numeric(DfO2S$valuenum)
## Warning: NAs introduced by coercion
DfWBC$valuenum <- as.numeric(DfWBC$valuenum)
## Warning: NAs introduced by coercion
# Create a new column in DfTEMP (temp) so all the records are in Celsius.
DfTEMP$temp = DfTEMP$valuenum
DfTEMP[, temp := round(((DfTEMP$valuenum-32) * (5/9)), 1)]
DfTEMP[itemid == 676 |itemid == 677 |itemid == 223762 , temp := round(DfTEMP$valuenum, 1)]
## Warning in `[.data.table`(DfTEMP, itemid == 676 | itemid == 677 | itemid
## == : Supplied 1422996 items to be assigned to 626360 items of column
## 'temp' (796636 unused)
DfTEMP$tempC[DfTEMP$itemid %in% c(676, 677, 223762)] <- DfTEMP$valuenum[DfTEMP$itemid %in% c(676, 677, 223762)]
DfTEMP$tempC[DfTEMP$itemid %in% c(678, 679, 223761)] <- DfTEMP$temp[DfTEMP$itemid %in% c(678, 679, 223761)]
# length(unique(DfHR$icustay_id))
# length(unique(DfDBP$icustay_id))
# length(unique(DfO2S$icustay_id))
# #length(unique(DfPH$icustay_id))
# length(unique(DfRR$icustay_id))
# length(unique(DfSBP$icustay_id))
# length(unique(DfTEMP$icustay_id))
# length(unique(DfWBC$subject_id))
Firstly, I check that all the records were measured in the same unit for each dynamic variable.
unique(DfHR$valueuom)
unique(DfDBP$valueuom)
unique(DfSBP$valueuom)
unique(DfRR$valueuom)
unique(DfTEMP$valueuom)
unique(DfO2S$valueuom)
unique(DfWBC$valueuom)
Double check the NULL values:
DfSBP[valueuom == "NULL"]
DfRR[valueuom == "NULL"]
DfTEMP[valueuom == "NULL"]
DfO2S[valueuom == "NULL"]
DfWBC[valueuom == "NULL"]
Several approaches were tested to define reasonable boundaries for each of the dynamic categories:
Domain knowledge (2 clinicians).
Medical guidelines: ICNARC criteria, NHS England.
Johnson et al ( GitHub code).
Statistical approach. Based on the method described in Johnson et al, 2014.
| Domain knowledge | Medical guidelines | Johnson et al (Github) | Statistical approach | |
|---|---|---|---|---|
| HR | 200/40 | 252/0 | 300/0 | 257/22 |
| SBP | 250/50 | 502/0 | 400/0 | 341/33 |
| DBP | 160/0 | 671/0 | 300/0 | 308/5 |
| RR | 40/10 | 4084/0 | 70/0 | 91/2 |
| TEMP | 50/24 | 50/27 | ||
| SpO2 | 100/40 | 11751/0 | 100/0 | 132/70 |
| WBC | 30/0 |
The number of WBC records was very low and therefore, this variable was excluded in the analysis at this point.
# just once for HR
library(MASS)
alpha = 0.01
x <- DfHR[DfHR$valuenum > 0,]$valuenum
N <- length(x)
aux <- boxcox( lm(x~ 1) )
lambda <- aux$x[which(aux$y == max(aux$y))]
x1 <- (x^lambda - 1)/lambda # (with x1 <- log(x) if lambda = 0)
lowertrans <- qnorm(alpha/N, mean = mean(x1), sd = sd(x1))
uppertrans <- qnorm(1-alpha/N, mean = mean(x1), sd = sd(x1))
lower <- (lowertrans*lambda + 1)^(1/lambda)
upper <- (uppertrans*lambda + 1)^(1/lambda)
HR_StatRange <- StatisticalRange(DfHR[DfHR$valuenum > 0,]$valuenum)
SBP_StatRange <- StatisticalRange(DfSBP[DfSBP$valuenum > 0,]$valuenum)
DBP_StatRange <- StatisticalRange(DfDBP[DfDBP$valuenum >= 0,]$valuenum)
O2S_StatRange <- StatisticalRange(DfO2S[DfO2S$valuenum > 0,]$valuenum)
TEMP_StatRange <- StatisticalRange(DfTEMP[DfTEMP$tempC > 0,]$tempC)
WBC_StatRange <- StatisticalRange(DfWBC[DfWBC$valuenum > 0,]$valuenum)
par(mfrow=c(4,2))
hist(DfHR$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution HR values")
abline(v = HR_StatRange$lower, col = "red")
abline(v = HR_StatRange$upper, col = "red")
hist(DfSBP$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution SBP values")
abline(v = SBP_StatRange$lower, col = "red")
abline(v = SBP_StatRange$upper, col = "red")
hist(DfDBP$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution DBP values")
abline(v = DBP_StatRange$lower, col = "red")
abline(v = DBP_StatRange$upper, col = "red")
RR_StatRange <- StatisticalRange(DfRR[DfRR$valuenum > 0,]$valuenum)
hist(DfRR$valuenum, xlim = c(0, 10000), breaks = 1000, main= "Distribution RR values")
abline(v = RR_StatRange$lower, col = "red")
abline(v = RR_StatRange$upper, col = "red")
hist(DfO2S$valuenum, ylim = c(0, 3000000), xlim = c(0, 1000), breaks = 10000, main= "Distribution O2S values")
abline(v = O2S_StatRange$lower, col = "red")
abline(v = O2S_StatRange$upper, col = "red")
hist(DfTEMP$temp, xlim = c(0, 1000), breaks = 10000, main= "Distribution TEMP values")
abline(v = TEMP_StatRange$lower, col = "red")
abline(v = TEMP_StatRange$upper, col = "red")
hist(DfWBC$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution WBC values")
abline(v = WBC_StatRange$lower, col = "red")
abline(v = WBC_StatRange$upper, col = "red")
HR_adj <- DfHR[DfHR$valuenum >= 22 & DfHR$valuenum <= 257, ]
SBP_adj <- DfSBP[DfSBP$valuenum >= 33 & DfSBP$valuenum <= 341 , ]
DBP_adj <- DfDBP[DfDBP$valuenum >= 5 & DfDBP$valuenum <= 308, ]
RR_adj <- DfRR[DfRR$valuenum >= 2 & DfRR$valuenum <= 91, ]
TEMP_adj <- DfTEMP[DfTEMP$tempC >= 27 & DfTEMP$tempC <= 50, ]
O2S_adj <- DfO2S[DfO2S$valuenum >= 70 & DfO2S$valuenum <= 132, ]
#WBC_adj <- DfWBC[DfWBC$valuenum > 0 & DfWBC$valuenum < 30, ]
# length(unique(HR_adj$subject_id))
# length(unique(SBP_adj$subject_id))
# length(unique(DBP_adj$subject_id))
# length(unique(RR_adj$subject_id))
# length(unique(TEMP_adj$subject_id))
# length(unique(O2S_adj$subject_id))
# length(unique(WBC_adj$subject_id))
After excluding outliers:
# SBP
SBP_tables <- mergeDynChart(SBP_adj, sbp)
SBP_merge <- SBP_tables$mergeDF
SBP_hours <- SBP_tables$mergeHours
# DBP
DBP_tables <- mergeDynChart(DBP_adj, dbp)
DBP_merge <- DBP_tables$mergeDF
DBP_hours <- DBP_tables$mergeHours
# HR
HR_tables <- mergeDynChart(HR_adj, hr)
HR_merge <- HR_tables$mergeDF
HR_hours <- HR_tables$mergeHours
# O2S
O2S_tables <- mergeDynChart(O2S_adj, o2s)
O2S_merge <- O2S_tables$mergeDF
O2S_hours <- O2S_tables$mergeHours
# RR
RR_tables <- mergeDynChart(RR_adj, rr)
RR_merge <- RR_tables$mergeDF
RR_hours <- RR_tables$mergeHours
# Special case Temperature
TEMP_merge <- merge(TEMP_adj[, c("icustay_id", "subject_id", "itemid", "charttime", "tempC")], PatientInfo, by = c("icustay_id", "subject_id"), all = TRUE )
TEMP_merge$time <- round(unclass(difftime(as.POSIXlt(TEMP_merge$charttime), as.POSIXlt(TEMP_merge$intime), units="hours")))
TEMP_hours <- aggregate(as.numeric(tempC) ~ icustay_id + subject_id + death_icu + time, data = TEMP_merge, FUN = "mean", na.rm=FALSE, na.action = na.pass)
names(TEMP_hours) <- c("icustay_id", "subject_id", "death_icu", "time", "tempC")
TEMP_hours <- TEMP_hours[TEMP_hours$time >= 0,]
TEMP_hours[5] <- round(TEMP_hours[5], 2)
# Special case: WBC
#WBC_tables <- mergeDynLab(WBC_adj, wbc)
#WBC_merge <- WBC_tables$mergeDF
#WBC_hours <- WBC_tables$mergeHours
## SBP
# It should return 25,145
# length(unique(SBP_merge$subject_id))
# length(unique(SBP_merge$icustay_id))
# It should return 25,142
# length(unique(SBP_hours$subject_id))
# length(unique(SBP_hours$icustay_id))
# setdiff(sort(unique(SBP_merge$subject_id)), sort(unique(SBP_hours$subject_id)))
# setdiff(sort(unique(SBP_merge$icustay_id)), sort(unique(SBP_hours$icustay_id)))
# SBP_merge[subject_id %in% c(41874,64590, 73583 ),]
## DBP
# It should return 25,145
# length(unique(DBP_merge$subject_id))
# length(unique(DBP_merge$icustay_id))
# It should return 25,142
# length(unique(DBP_hours$subject_id))
# length(unique(DBP_hours$icustay_id))
# setdiff(sort(unique(DBP_merge$subject_id)), sort(unique(DBP_hours$subject_id)))
# setdiff(sort(unique(DBP_merge$icustay_id)), sort(unique(DBP_hours$icustay_id)))
# DBP_merge[subject_id %in% c(41874,64590, 73583 ),]
## RR
# It should return 25,145
# length(unique(RR_merge$subject_id))
# length(unique(RR_merge$icustay_id))
# It should return 25,131
# length(unique(RR_hours$subject_id))
# length(unique(RR_hours$icustay_id))
# setdiff(sort(unique(RR_merge$subject_id)), sort(unique(RR_hours$subject_id)))
# setdiff(sort(unique(RR_merge$icustay_id)), sort(unique(RR_hours$icustay_id)))
# RR_merge[subject_id %in% c( 1628,6306,6314,9986,12840,13673,15302,15944,16980,20099,21542,24956,44611,97922 ),]
## HR
# It should return 25145
# length(unique(HR_merge$subject_id))
# length(unique(HR_merge$icustay_id))
# It should return 25145
# length(unique(HR_hours$subject_id))
# length(unique(HR_hours$icustay_id))
## O2S
# It should return 25,145
# length(unique(O2S_merge$subject_id))
# length(unique(O2S_merge$icustay_id))
# It should return 25,133
# length(unique(O2S_hours$subject_id))
# length(unique(O2S_hours$icustay_id))
# setdiff(sort(unique(O2S_merge$subject_id)), sort(unique(O2S_hours$subject_id)))
# setdiff(sort(unique(O2S_merge$icustay_id)), sort(unique(O2S_hours$icustay_id)))
# O2S_merge[subject_id %in% c( 1628,4607, 16007, 24256, 49395, 67813, 68439, 70912, 77589, 87174),]
## TEMP
# It should return 25,145
# length(unique(TEMP_merge$subject_id))
# length(unique(TEMP_merge$icustay_id))
# It should return 25,078
# length(unique(TEMP_hours$subject_id))
# length(unique(TEMP_hours$icustay_id))
## WBC
# It should return 25,145 patients
# length(unique(WBC_merge$subject_id))
# length(unique(WBC_merge$icustay_id))
# It should return 24,734
# length(unique(WBC_hours$subject_id))
# length(unique(WBC_hours$icustay_id))
# setdiff(sort(unique(WBC_merge$subject_id)), sort(unique(WBC_hours$subject_id)))
# setdiff(sort(unique(WBC_merge$icustay_id)), sort(unique(WBC_hours$icustay_id)))
# WBC_merge[subject_id %in% setdiff(sort(unique(WBC_merge$subject_id)), sort(unique(WBC_hours$subject_id)))[1:100],]
BasicHR <- basicAnalysis(HR_hours, hr, "Heart rate")
MedianHR360 <- medianValuesPlot(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable=hr, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinHR360 <- PlotMaxMin(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable="Heart rate", hours=360)
MedianHR720 <- medianValuesPlot(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable=hr, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinHR720 <- PlotMaxMin(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable="Heart rate", hours=720)
par(mfrow=c(2,2))
MedianHR360$plotMedian
MedianHR720$plotMedian
PlotMaxMinHR360$plotMaxMin
PlotMaxMinHR720$plotMaxMin
SpaguettiPlot360HR <- SpaguettiPlot(died1=BasicHR$died1, died0=BasicHR$died0, "Heart Rate", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hHR <- BoxPlot4h(died0=BasicHR$died0, died1=BasicHR$died1, HR_hours, "HR", ylab=200)
BoxPlot4hHR$BoxPlot
BasicSBP <- basicAnalysis(SBP_hours, sbp, "Systolic Blood Pressure")
MedianSBP360 <- medianValuesPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable=sbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBP360 <- PlotMaxMin(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable="Systolic Blood Pressure", hours=360)
MedianSBP720 <- medianValuesPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable=sbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBP720 <- PlotMaxMin(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable="Systolic Blood Pressure", hours=720)
par(mfrow=c(2,2))
MedianSBP360$plotMedian
PlotMaxMinSBP360$plotMaxMin
MedianSBP720$plotMedian
PlotMaxMinSBP720$plotMaxMin
SpaguettiPlot360SBP <- SpaguettiPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, "Systolic Blood Pressure", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hSBP <- BoxPlot4h(died0=BasicSBP$died0, died1=BasicSBP$died1, SBP_hours, "Systolic Blood Pressure", ylab=300)
BoxPlot4hSBP$BoxPlot
BasicDBP <- basicAnalysis(DBP_hours, sbp, "Diastolic Blood Pressure")
MedianDBP360 <- medianValuesPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable=dbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianDBP720 <- medianValuesPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable=dbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinDBP360 <- PlotMaxMin(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable="Diastolic Blood Pressure", hours=360)
PlotMaxMinDBP720 <- PlotMaxMin(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable="Diastolic Blood Pressure", hours=720)
par(mfrow=c(2,2))
MedianDBP360$plotMedian
MedianDBP720$plotMedian
PlotMaxMinDBP360$plotMaxMin
PlotMaxMinDBP720$plotMaxMin
SpaguettiPlot360DBP <- SpaguettiPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, "Diastolic Blood Pressure", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hDBP <- BoxPlot4h(died0=BasicDBP$died0, died1=BasicDBP$died1, DBP_hours, "Diastolic Blood Pressure", ylab=250)
BoxPlot4hDBP$BoxPlot
BasicRR <- basicAnalysis(RR_hours, rr, "Respiration Rate")
MedianRR360 <- medianValuesPlot(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable=rr, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianRR720 <- medianValuesPlot(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable=rr, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinRR360 <- PlotMaxMin(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable="Respiration Rate", hours=360)
PlotMaxMinRR720 <- PlotMaxMin(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable="Respiration Rate", hours=720)
par(mfrow=c(2,2))
MedianRR360$plotMedian
MedianRR720$plotMedian
PlotMaxMinRR360$plotMaxMin
PlotMaxMinRR720$plotMaxMin
par(mfrow=c(2,2))
SpaguettiPlot360RR <- SpaguettiPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, "Respiration Rate", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hRR <- BoxPlot4h(died0=BasicRR$died0, died1=BasicRR$died1, RR_hours, "Respiration Rate", ylab=100)
BoxPlot4hRR$BoxPlot
BasicO2S <- basicAnalysis(O2S_hours, o2s, "Blood Oxygen Saturation")
MedianO2S360 <- medianValuesPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable=o2s, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianO2S720 <- medianValuesPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable=o2s, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinO2S360 <- PlotMaxMin(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable= "Blood Oxygen Saturation", hours=360)
PlotMaxMinO2S720 <- PlotMaxMin(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable="Blood Oxygen Saturation", hours=720)
par(mfrow=c(2,2))
MedianO2S360$plotMedian
MedianO2S720$plotMedian
PlotMaxMinO2S360$plotMaxMin
PlotMaxMinO2S720$plotMaxMin
SpaguettiPlot360O2S <- SpaguettiPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, "Blood Oxygen Saturation", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hO2S <- BoxPlot4h(died0=BasicO2S$died0, died1=BasicO2S$died1, O2S_hours, "Blood Oxygen Saturation", ylab=110)
BoxPlot4hO2S$BoxPlot
BasicTEMP <- basicAnalysis(TEMP_hours, tempC, "Temperature")
MedianTEMP360 <- medianValuesPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable=tempC, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianTEMP720 <- medianValuesPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable=tempC, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinTEMP360 <- PlotMaxMin(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable="Temperature", hours=360)
PlotMaxMinTEMP720 <- PlotMaxMin(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable="Temperature", hours=720)
par(mfrow=c(2,2))
MedianTEMP360$plotMedian
MedianTEMP720$plotMedian
PlotMaxMinTEMP360$plotMaxMin
PlotMaxMinTEMP720$plotMaxMin
par(mfrow=c(2,2))
SpaguettiPlot360TEMP <- SpaguettiPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, "Temperature", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hTEMP <- BoxPlot4h(died0=BasicTEMP$died0, died1=BasicTEMP$died1, TEMP_hours, "Temperature", ylab=50)
BoxPlot4hTEMP$BoxPlot
Systolic Blood Pressure can be measured invasely or non-invasively. In this case, we will run the analysis only taken into account the measurements performed directly in the artery (itemid: 6, 51, 6701, 220050). 14,774 patients were recorded using this itemid.
SBP_arterial <- SBP_merge[SBP_merge$itemid %in% c(6, 51, 6701, 220050),]
SBPA_hours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time, data = SBP_arterial, FUN = "mean", na.rm=FALSE, na.action = na.pass)
names(SBPA_hours) <- c("icustay_id", "subject_id", "death_icu", "time", "sbp")
SBPA_hours <- SBPA_hours[SBPA_hours$time >= 0, ]
SBPA_hours$sbp <- round(SBPA_hours$sbp, 2)
print(" First 6 rows")
## [1] " First 6 rows"
head(SBPA_hours)
## icustay_id subject_id death_icu time sbp
## 1220 263738 13 0 0 151
## 1221 262236 24 0 0 137
## 1222 295037 32 0 0 108
## 1223 274249 45 0 0 143
## 1224 249195 49 0 0 104
## 1225 297831 77 0 0 127
length(unique(SBPA_hours$icustay_id))
## [1] 14770
BasicSBPA <- basicAnalysis(SBPA_hours, sbp, "SBP Arterial")
MedianSBPA360 <- medianValuesPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable=sbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianSBPA720 <- medianValuesPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable=sbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBPA360 <- PlotMaxMin(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable="SBP Arterial", hours=360)
PlotMaxMinSBPA720 <- PlotMaxMin(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable="SBP Arterial", hours=720)
par(mfrow=c(2,2))
MedianSBPA360$plotMedian
MedianSBPA720$plotMedian
PlotMaxMinSBPA360$plotMaxMin
PlotMaxMinSBPA720$plotMaxMin
par(mfrow=c(2,2))
SpaguettiPlot360SBPA <- SpaguettiPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, "SBP Arterial", patients=50, hours=360)
par(mfrow=c(1,1))
BoxPlot4hSBPA <- BoxPlot4h(died0=BasicSBPA$died0, died1=BasicSBPA$died1, SBPA_hours, "SBP Arterial", ylab=280)
BoxPlot4hSBPA$BoxPlot
#nrow(SBP_hours[SBP_hours$time < 4, ])
#68571
#nrow(HR_hours[HR_hours$time < 4, ])
#70053
HS_4h <- merge(HR_hours[HR_hours$time < 4, ], SBP_hours[SBP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)
#nrow(HS_4h)
#70299
HSD_4h <- merge(HS_4h, DBP_hours[DBP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)
# Create pulse pressure
HSD_4h$pp <- HSD_4h$sbp - HSD_4h$dbp
#nrow(HSD_4h)
#70299
HSDPR_4h <- merge(HSD_4h, RR_hours[RR_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)
HSDPRO_4h <- merge(HSDPR_4h, O2S_hours[O2S_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)
HSDPROT_4h <- merge(HSDPRO_4h, TEMP_hours[TEMP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)
#nrow(HSDPROT_4h)
#72350
The total merge dataset has 23,124 patients. However, because many patients have more than 1 missing value for temperature at the different 4 time points, this variable is excluded at this point.
Df4h <- HSDPRO_4h
patients4h <- unique(Df4h$icustay_id)
length(patients4h)
## [1] 23107
i = 1
for (patient in patients4h){
if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 0,])) == 0){
new_time <- c(patient,
Df4h[Df4h$icustay_id == patient,]$subject_id[1],
Df4h[Df4h$icustay_id == patient,]$death_icu[1],
0,
NA, NA, NA, NA, NA, NA)
Df4h <- rbind(Df4h, new_time)
}
if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 1,])) == 0){
new_time <- c(patient,
Df4h[Df4h$icustay_id == patient,]$subject_id[1],
Df4h[Df4h$icustay_id == patient,]$death_icu[1],
1,
NA, NA, NA, NA, NA, NA)
Df4h <- rbind(Df4h, new_time)
}
if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 2,])) == 0){
new_time <- c(patient,
Df4h[Df4h$icustay_id == patient,]$subject_id[1],
Df4h[Df4h$icustay_id == patient,]$death_icu[1],
2,
NA, NA, NA, NA, NA, NA)
Df4h <- rbind(Df4h, new_time)
}
if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 3,])) == 0){
new_time <- c(patient,
Df4h[Df4h$icustay_id == patient,]$subject_id[1],
Df4h[Df4h$icustay_id == patient,]$death_icu[1],
3,
NA, NA, NA, NA, NA, NA)
Df4h <- rbind(Df4h, new_time)
}
}
nrow(HSDPROT_4h)
## [1] 72350
nrow(Df4h)
## [1] 92428
length(unique(HSDPROT_4h$icustay_id))
## [1] 23124
length(unique(Df4h$icustay_id))
## [1] 23107
After completing the table with all times: 92496 rows.
drop_patients <- list()
for (patient in patients4h) {
if (length(which(is.na(Df4h[Df4h$icustay_id == patient,]$hr))) > 1 ||
length(which(is.na(Df4h[Df4h$icustay_id == patient,]$sbp))) > 1 ||
length(which(is.na(Df4h[Df4h$icustay_id == patient,]$dbp))) > 1 ||
length(which(is.na(Df4h[Df4h$icustay_id == patient,]$pp))) > 1 ||
length(which(is.na(Df4h[Df4h$icustay_id == patient,]$rr))) > 1 ||
length(which(is.na(Df4h[Df4h$icustay_id == patient,]$o2s))) > 1)
{
drop_patients[i] <- patient
i = i + 1
#Df4h[-Df4h$icustay_id == patient,]
#print(head(Df4h))
#print(patient)
}
}
length(drop_patients)
## [1] 8363
prueba <- Df4h[!Df4h$icustay_id %in% drop_patients,]
length(unique(prueba$icustay_id))
## [1] 14744
raw_4H <- prueba[order(prueba$icustay_id, prueba$time),]
Patients with data for the first 4h: 23,124. Number of patients with more than 1 missing value for a variable: 8,435. Number of patients in the final dataset: 14,744.
The missing values were imputted based on the mean between data point before and after the missing value (ie. 2 and 3), or by duplicating the value after (the case for the first value), or duplicating the value before (the case for the fourth value).
Summary of missing values: - HR: 5483 rows - SBP: 5984 rows - DBP: 5988 rows - PP: 5992 rows - RR: 5422 rows - O2S: 5903 rows
summary(raw_4H)
## icustay_id subject_id death_icu time
## Min. :200003 Min. : 3 Min. :0.0000 Min. :0.00
## 1st Qu.:225192 1st Qu.:13528 1st Qu.:0.0000 1st Qu.:0.75
## Median :249887 Median :27134 Median :0.0000 Median :1.50
## Mean :249985 Mean :37261 Mean :0.1069 Mean :1.50
## 3rd Qu.:274880 3rd Qu.:61364 3rd Qu.:0.0000 3rd Qu.:2.25
## Max. :299992 Max. :99999 Max. :1.0000 Max. :3.00
##
## hr sbp dbp pp
## Min. : 26.00 Min. : 47.0 Min. : 10.00 Min. :-162.00
## 1st Qu.: 73.00 1st Qu.:105.5 1st Qu.: 53.00 1st Qu.: 46.00
## Median : 84.00 Median :119.2 Median : 62.00 Median : 56.00
## Mean : 86.28 Mean :121.7 Mean : 63.29 Mean : 58.44
## 3rd Qu.: 98.00 3rd Qu.:136.0 3rd Qu.: 72.00 3rd Qu.: 69.00
## Max. :187.33 Max. :254.2 Max. :282.00 Max. : 166.00
## NA's :5483 NA's :5984 NA's :5988 NA's :5992
## rr o2s
## Min. : 2.00 Min. : 70.00
## 1st Qu.:14.50 1st Qu.: 96.00
## Median :17.67 Median : 98.25
## Mean :18.40 Mean : 97.61
## 3rd Qu.:21.00 3rd Qu.:100.00
## Max. :91.00 Max. :100.00
## NA's :5422 NA's :5903
raw_4H2 <- raw_4H
## columns:
# HR - 5
# SBP - 6
# DBP - 7
# PP - 8
# RR - 9
# O2S - 10
raw_4H3 <- NAimputation(raw_4H2, 5)
raw_4H4 <- NAimputation(raw_4H3, 6)
raw_4H5 <- NAimputation(raw_4H4, 7)
raw_4H6 <- NAimputation(raw_4H5, 8)
raw_4H7 <- NAimputation(raw_4H6, 9)
raw_4H8 <- NAimputation(raw_4H7, 10)
nrow(raw_4H8)
## [1] 58976
length(unique(raw_4H8$icustay_id))
## [1] 14744
summary(raw_4H8)
## icustay_id subject_id death_icu time
## Min. :200003 Min. : 3 Min. :0.0000 Min. :0.00
## 1st Qu.:225192 1st Qu.:13528 1st Qu.:0.0000 1st Qu.:0.75
## Median :249887 Median :27134 Median :0.0000 Median :1.50
## Mean :249985 Mean :37261 Mean :0.1069 Mean :1.50
## 3rd Qu.:274880 3rd Qu.:61364 3rd Qu.:0.0000 3rd Qu.:2.25
## Max. :299992 Max. :99999 Max. :1.0000 Max. :3.00
## hr sbp dbp pp
## Min. : 26.00 Min. : 47.0 Min. : 10.00 Min. :-162.00
## 1st Qu.: 73.00 1st Qu.:106.0 1st Qu.: 53.50 1st Qu.: 46.00
## Median : 84.00 Median :120.0 Median : 62.00 Median : 56.33
## Mean : 86.26 Mean :122.1 Mean : 63.61 Mean : 58.53
## 3rd Qu.: 98.00 3rd Qu.:136.0 3rd Qu.: 72.00 3rd Qu.: 69.00
## Max. :187.33 Max. :254.2 Max. :282.00 Max. : 166.00
## rr o2s
## Min. : 2.00 Min. : 70.00
## 1st Qu.:14.50 1st Qu.: 96.00
## Median :17.50 Median : 98.40
## Mean :18.34 Mean : 97.63
## 3rd Qu.:21.00 3rd Qu.:100.00
## Max. :91.00 Max. :100.00
The final dataset after imputation: 58,976 rows, 14,744 patients
Saving the data frame
#save(raw_4H8,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicCohort4H.Rda")
# write.table(raw_4H8,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicCohort4H.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
# write.table(SBP_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicSBP_hours.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
#
# write.table(HR_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicHR_hours.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
#
# write.table(DBP_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicDBP_hours.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
#
# write.table(RR_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicRR_hours.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
#
# write.table(O2S_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicO2S_hours.csv",
# row.names=FALSE, na="",col.names=TRUE, sep="\t")
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plyr_1.8.4 MASS_7.3-47 ggplot2_2.2.1
## [4] data.table_1.10.4-3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 knitr_1.18 magrittr_1.5 munsell_0.4.3
## [5] colorspace_1.3-2 rlang_0.2.0 stringr_1.2.0 tools_3.4.2
## [9] grid_3.4.2 gtable_0.2.0 htmltools_0.3.6 yaml_2.1.16
## [13] lazyeval_0.2.1 rprojroot_1.3-2 digest_0.6.12 tibble_1.4.2
## [17] evaluate_0.10.1 rmarkdown_1.8 labeling_0.3 stringi_1.1.6
## [21] compiler_3.4.2 pillar_1.1.0 scales_0.5.0 backports_1.1.2